We are going to first import the merged counts tables that is outputted from the Nextflow pipeline. This table already has the raw counts summarised from the alignments and merged into a single matrix.
# Import CD4 counts
counts = readRDS('../data/ITO66-counts.rda')
head(counts)
## Bcells_alone_1 Bcells_alone_2 Bcells_cntrl Bcells_Tcells_1
## KLHL17_001 189 160 188 107
## AGRN_001 505 1499 1379 1167
## B3GALT6_001 301 330 255 179
## DVL1_001 232 299 344 217
## DVL1_002 137 77 125 90
## DVL1_003 232 108 145 68
## Bcells_Tcells_2 Bcells_Tcells_3 PlasmidDNA
## KLHL17_001 130 35 3
## AGRN_001 1038 1167 2852
## B3GALT6_001 208 85 53
## DVL1_001 299 44 38
## DVL1_002 123 18 6
## DVL1_003 83 45 31
# Import CD4 metadata
metadata = readRDS('../data/ITO66-metadata.rda')
head(metadata)
## Sample Group
## Bcells_alone_1 Bcells_alone_1 Bcell
## Bcells_alone_2 Bcells_alone_2 Bcell
## Bcells_Tcells_1 Bcells_Tcells_1 BTcell
## Bcells_Tcells_2 Bcells_Tcells_2 BTcell
## Bcells_Tcells_3 Bcells_Tcells_3 BTcell
## Bcells_cntrl Bcells_cntrl Bcell
# Import MultiQC results
report = readRDS('../data/ITO66-report.rda')
head(report)
## # A tibble: 6 × 47
## Sample raw_total_seque… filtered_sequen… sequences is_sorted `1st_fragments`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bcells_… 615906 0 615906 1 615906
## 2 Bcells_… 590307 0 590307 1 590307
## 3 Bcells_… 438168 0 438168 1 438168
## 4 Bcells_… 332876 0 332876 1 332876
## 5 Bcells_… 865567 0 865567 1 865567
## 6 Bcells_… 784799 0 784799 1 784799
## # … with 41 more variables: last_fragments <dbl>, reads_mapped <dbl>,
## # reads_mapped_and_paired <dbl>, reads_unmapped <dbl>,
## # reads_properly_paired <dbl>, reads_paired <dbl>, reads_duplicated <dbl>,
## # reads_MQ0 <dbl>, reads_QC_failed <dbl>, `non-primary_alignments` <dbl>,
## # total_length <dbl>, total_first_fragment_length <dbl>,
## # total_last_fragment_length <dbl>, bases_mapped <dbl>,
## # `bases_mapped_(cigar)` <dbl>, bases_trimmed <dbl>, …
p1 = report %>%
select(Sample, reads_unmapped, reads_mapped) %>%
mutate(Sample = str_replace(Sample, '-bamstats', '')) %>%
pivot_longer(cols = c("reads_unmapped", "reads_mapped"), names_to = "feature", values_to = "reads") %>%
mutate(feature = if_else(feature == "reads_unmapped", "Unmapped", feature)) %>%
mutate(feature = if_else(feature == "reads_mapped", "Mapped", feature)) %>%
ggplot(aes(x = reorder(Sample, reads), y = reads, fill = feature)) +
geom_col() +
coord_flip() +
theme_classic(base_size = 10) +
theme(legend.position = "top") +
scale_fill_brewer(palette = "Set2") +
xlab("") +
ylab("Number of reads") +
ggtitle("Library mapping rate")
p1
### Percent of mapped reads
p2 = report %>%
select(Sample, reads_unmapped_percent, reads_mapped_percent ) %>%
mutate(Sample = str_replace(Sample, '-bamstats', '')) %>%
pivot_longer(cols = c("reads_unmapped_percent", "reads_mapped_percent"), names_to = "feature", values_to = "reads") %>%
mutate(reads = reads / 100) %>%
mutate(feature = if_else(feature == "reads_unmapped_percent", "Unmapped", feature)) %>%
mutate(feature = if_else(feature == "reads_mapped_percent", "Mapped", feature)) %>%
ggplot(aes(x = reorder(Sample, reads), y = reads, fill = feature)) +
geom_col() +
coord_flip() +
theme_classic(base_size = 10) +
theme(legend.position = "top") +
scale_fill_brewer(palette = "Set2") +
scale_y_continuous(labels = scales::percent) +
xlab("") +
ylab("Percent of reads")
p2
# Plot together
p = (p1 | p2) + plot_layout(guides = "collect") & theme(legend.position = "bottom")
p
ggsave("figures/ITO66-mapping-qc.pdf", p, width = 8, height = 5)
# Match the sample ID's between samples
counts = counts[,match(metadata$Sample, colnames(counts))]
# Make DESeq2 object
ddsMat <- DESeqDataSetFromMatrix(countData = counts,
colData = metadata,
design = ~Group)
# Get normalize counts
ddsMat <- estimateSizeFactors(ddsMat)
# Filter flow counts
ddsMat.fil = ddsMat %>%
filter_low(percentile = 0.05)
# Get normalized counts
counts.norm.fil <- (counts(ddsMat.fil, normalized = TRUE) + 0.5) %>%
as.data.frame() %>%
rownames_to_column("oligo_id")
# Create a long table
counts.norm.fil.long = counts.norm.fil %>%
pivot_longer(cols = -oligo_id, names_to = "sampleId", values_to = "norm")
Bcells_alone_1 appears to be too noisy for inclusion as as replicate
# Bcell
counts.norm.fil %>%
ggplot(., aes(x = log10(Bcells_alone_1), y = log10(Bcells_alone_2))) +
geom_point(colour = alpha("grey", 0.7)) +
xlab("Bcell #1") +
ylab("Bcell #2") +
theme_classic(base_size = 9) +
stat_cor(method = "pearson", aes(label = ..r.label..)) +
stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'
counts.norm.fil %>%
ggplot(., aes(x = log10(Bcells_alone_1), y = log10(Bcells_cntrl))) +
geom_point(colour = alpha("grey", 0.7)) +
xlab("Bcell #1") +
ylab("Bcell-ctrl #1") +
theme_classic(base_size = 9) +
stat_cor(method = "pearson", aes(label = ..r.label..)) +
stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'
counts.norm.fil %>%
ggplot(., aes(x = log10(Bcells_alone_2), y = log10(Bcells_cntrl))) +
geom_point(colour = alpha("grey", 0.7)) +
xlab("Bcell #2") +
ylab("Bcell-ctrl #1") +
theme_classic(base_size = 9) +
stat_cor(method = "pearson", aes(label = ..r.label..)) +
stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'
# B & T-cells
counts.norm.fil %>%
ggplot(., aes(x = log10(Bcells_Tcells_1), y = log10(Bcells_Tcells_2))) +
geom_point(colour = alpha("grey", 0.7)) +
xlab("BT cells #1") +
ylab("BT cells #2") +
theme_classic(base_size = 9) +
stat_cor(method = "pearson", aes(label = ..r.label..)) +
stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'
counts.norm.fil %>%
ggplot(., aes(x = log10(Bcells_Tcells_1), y = log10(Bcells_Tcells_3))) +
geom_point(colour = alpha("grey", 0.7)) +
xlab("BT cells #1") +
ylab("BT cells #3") +
theme_classic(base_size = 9) +
stat_cor(method = "pearson", aes(label = ..r.label..)) +
stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'
counts.norm.fil %>%
ggplot(., aes(x = log10(Bcells_Tcells_2), y = log10(Bcells_Tcells_3))) +
geom_point(colour = alpha("grey", 0.7)) +
xlab("BT cells #2") +
ylab("BT cells #3") +
theme_classic(base_size = 9) +
stat_cor(method = "pearson", aes(label = ..r.label..)) +
stat_smooth(method = "lm", color = "red")
## `geom_smooth()` using formula 'y ~ x'
counts.norm.fil.long %>%
ggplot(aes(x = log10(norm))) +
geom_histogram(aes(y=..density..), binwidth=0.25, colour="black", fill="white") +
geom_density(alpha = 0.2, fill = "#FF6666") +
facet_wrap(sampleId~.) +
theme_light(base_size = 11) +
ylab("Density") +
xlab("Normalized reads per oligo (log10)")
# Keep samples of interest
ddsMat.fil = ddsMat[,which(colnames(assay(ddsMat)) %in% c("Bcells_alone_2", "Bcells_cntrl", "Bcells_Tcells_2", "Bcells_Tcells_1"))]
# Detect dropouts
dropout.res = compute_stats(ddsMat.fil, trt = "BTcell", ref = "Bcell", fitType = "parametric", lfcThreshold = 0.25, alpha = 0.05) %>%
mutate(log2FoldChange = round(log2FoldChange, 2)) %>%
mutate(log2FoldChangeShrunken = round(log2FoldChangeShrunken, 2))
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
##
## out of 1804 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0.25 (up) : 0, 0%
## LFC < -0.25 (down) : 2, 0.11%
## outliers [1] : 0, 0%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Significant oligos
dropout.res %>%
filter(padj < 0.05)
## oligo_id baseMean log2FoldChange lfcSE stat pvalue
## 1 ABT1_001 295.7037 -2.93 0.2468228 -10.861683 8.774198e-28
## 2 EEF1A1_001B 137.5964 -2.33 0.3282979 -6.342926 1.127208e-10
## padj log2FoldChangeShrunken
## 1 1.582865e-24 -1.44
## 2 1.016742e-07 -0.82
# Browse
dropout.res %>%
select(oligo_id, log2FoldChange, padj) %>%
DT::datatable(extensions = 'Buttons', options = list(
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
))
# Plot MA-plot
ito66.maplot = dropout.res %>%
ggplot(aes(x = log2(baseMean), y = log2FoldChange, label = oligo_id)) +
geom_point(colour = alpha("grey", 0.7)) +
geom_point(data = filter(dropout.res, oligo_id %in% c("ABT1_001")), size = 3, colour = "#efc383") +
geom_point(data = filter(dropout.res, oligo_id %in% c("EEF1A1_001B")), size = 3, colour = "#efc383") +
geom_text_repel(data = filter(dropout.res, oligo_id %in% c("ABT1_001", "EEF1A1_001B")), box.padding = 0.5, max.overlaps = Inf, size = 2.5) +
theme_classic() +
xlim(5, 10) +
ylim(-3, 2) +
xlab("Mean counts (log2)") +
ylab("Fold change log2(B&Tcells/Bcells)") +
ggtitle("ITO66")
ggplotly(ito66.maplot)
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices datasets utils methods
## [8] base
##
## other attached packages:
## [1] forcats_0.5.1 stringr_1.4.0
## [3] dplyr_1.0.9 purrr_0.3.4
## [5] readr_2.1.2 tidyr_1.2.0
## [7] tibble_3.1.7 tidyverse_1.3.1
## [9] ggpubr_0.4.0 plotly_4.10.0
## [11] ggplotify_0.1.0 readxl_1.4.0
## [13] DESeq2_1.34.0 SummarizedExperiment_1.24.0
## [15] Biobase_2.54.0 MatrixGenerics_1.6.0
## [17] matrixStats_0.62.0 GenomicRanges_1.46.1
## [19] GenomeInfoDb_1.30.1 IRanges_2.28.0
## [21] S4Vectors_0.32.4 BiocGenerics_0.40.0
## [23] ggrepel_0.9.1 ggplot2_3.3.6
## [25] ggsci_2.9 patchwork_1.1.1
## [27] knitr_1.39
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-3 ggsignif_0.6.3 ellipsis_0.3.2
## [4] XVector_0.34.0 fs_1.5.2 rstudioapi_0.13
## [7] farver_2.1.0 DT_0.25 bit64_4.0.5
## [10] AnnotationDbi_1.56.2 fansi_1.0.3 lubridate_1.8.0
## [13] xml2_1.3.3 splines_4.1.3 cachem_1.0.6
## [16] geneplotter_1.72.0 jsonlite_1.8.0 broom_0.8.0
## [19] annotate_1.72.0 dbplyr_2.1.1 png_0.1-7
## [22] BiocManager_1.30.18 compiler_4.1.3 httr_1.4.2
## [25] backports_1.4.1 assertthat_0.2.1 Matrix_1.4-0
## [28] fastmap_1.1.0 lazyeval_0.2.2 cli_3.3.0
## [31] htmltools_0.5.2 tools_4.1.3 gtable_0.3.0
## [34] glue_1.6.2 GenomeInfoDbData_1.2.7 Rcpp_1.0.8.3
## [37] carData_3.0-5 cellranger_1.1.0 jquerylib_0.1.4
## [40] vctrs_0.4.1 Biostrings_2.62.0 nlme_3.1-155
## [43] crosstalk_1.2.0 xfun_0.30 rvest_1.0.2
## [46] lifecycle_1.0.1 renv_0.15.4 rstatix_0.7.0
## [49] XML_3.99-0.10 zlibbioc_1.40.0 scales_1.2.0
## [52] hms_1.1.1 parallel_4.1.3 RColorBrewer_1.1-3
## [55] yaml_2.3.5 memoise_2.0.1 yulab.utils_0.0.5
## [58] sass_0.4.1 stringi_1.7.6 RSQLite_2.2.15
## [61] highr_0.9 genefilter_1.76.0 BiocParallel_1.28.3
## [64] rlang_1.0.2 pkgconfig_2.0.3 bitops_1.0-7
## [67] evaluate_0.15 lattice_0.20-45 labeling_0.4.2
## [70] htmlwidgets_1.5.4 bit_4.0.4 tidyselect_1.1.2
## [73] magrittr_2.0.3 R6_2.5.1 generics_0.1.2
## [76] DelayedArray_0.20.0 DBI_1.1.2 mgcv_1.8-39
## [79] pillar_1.7.0 haven_2.5.0 withr_2.5.0
## [82] survival_3.2-13 KEGGREST_1.34.0 abind_1.4-5
## [85] RCurl_1.98-1.6 modelr_0.1.8 crayon_1.5.1
## [88] car_3.0-13 utf8_1.2.2 tzdb_0.3.0
## [91] rmarkdown_2.14 locfit_1.5-9.6 grid_4.1.3
## [94] data.table_1.14.2 blob_1.2.3 reprex_2.0.1
## [97] digest_0.6.29 xtable_1.8-4 gridGraphics_0.5-1
## [100] munsell_0.5.0 viridisLite_0.4.0 bslib_0.3.1